I reproduced the ScRNA analysis for a Nat Comm paper: https://www.nature.com/articles/s41467-020-14296-y#Sec2

Wang, X., Xu, H., Cheng, C., Ji, Z., Zhao, H., Sheng, Y., … & Zhu, H. H. (2020). Identification of a Zeb1 expressing basal stem cell subpopulation in the prostate. Nature communications, 11(1), 1-16.

Here’s my modified and updated script to reproduce the works.

#setwd("R_HelenHeZhu_StemCell")

###############################################################################
### Step01 removal of poor quality cells and contaminated non-epithelial cells 
###############################################################################

library(Seurat)
library(dplyr)
library(Matrix)
library(ggpubr)
#

data <- Read10X(data.dir = "GSE111429_RAW")
PG_all    <- CreateSeuratObject(count = data, min.cells = 0, min.genes = 0, project = "PG")
PG_all <- PercentageFeatureSet(PG_all, pattern = "^mt-", col.name = "percent.mt")
before <- VlnPlot(PG_all, features =c('nFeature_RNA','nCount_RNA','percent.mt'))

# Poor-quality cells with less than 1000 genes detected, less than 5000 UMIs or more than 5% UMI mapped to mitochondria genes were removed. 
PG_all <- subset(PG_all, subset = nFeature_RNA > 1400 & nCount_RNA > 5000 & percent.mt < 5)
after <- VlnPlot(PG_all, features =c('nFeature_RNA','nCount_RNA','percent.mt'))
ggarrange(before,after, ncol = 2, nrow = 1)

#
PG_all <- NormalizeData(PG_all, normalization.method='LogNormalize', scale.factor=1e4)
PG_all <- FindVariableFeatures(PG_all,
                               nfeatures = 1606,
                               mean.cutoff = c(0.0125,3),
                               dispersion.cutoff = c(0.5, Inf))
length(PG_all@assays$RNA@var.features)
## [1] 1606
summary(PG_all@assays$RNA@meta.features)
##     vst.mean         vst.variance      vst.variance.expected
##  Min.   :  0.0000   Min.   :    0.00   Min.   :     0.00    
##  1st Qu.:  0.0000   1st Qu.:    0.00   1st Qu.:     0.00    
##  Median :  0.0047   Median :    0.01   Median :     0.01    
##  Mean   :  0.4062   Mean   :    9.40   Mean   :    10.72    
##  3rd Qu.:  0.1242   3rd Qu.:    0.16   3rd Qu.:     0.16    
##  Max.   :488.0069   Max.   :75515.80   Max.   :135656.59    
##  vst.variance.standardized vst.variable   
##  Min.   : 0.0000           Mode :logical  
##  1st Qu.: 0.0000           FALSE:26392    
##  Median : 0.8621           TRUE :1606     
##  Mean   : 0.7906                          
##  3rd Qu.: 0.9670                          
##  Max.   :33.7765
#
PG_all <- ScaleData(PG_all, vars.to.regress=c('nCount_RNA','percent.mt'))
PG_all <- RunPCA(PG_all, features = PG_all@assays$RNA@var.features, verbose = TRUE, 
                 ndims.print=1:5, nfeatures.print = 5)
PCAPlot(PG_all, dims = c(1, 2))

# 
DimHeatmap(PG_all, dims = 1:12, cells = 500, balanced = TRUE)

#
PG_all <- FindNeighbors(PG_all, dims = 1:12)
PG_all <- FindClusters(PG_all, resolution=c(0.2,0.3,0.4,0.5))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 9833
## Number of edges: 323440
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9244
## Number of communities: 10
## Elapsed time: 1 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 9833
## Number of edges: 323440
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9059
## Number of communities: 11
## Elapsed time: 1 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 9833
## Number of edges: 323440
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8921
## Number of communities: 12
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 9833
## Number of edges: 323440
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8794
## Number of communities: 12
## Elapsed time: 1 seconds
PG_all <- RunTSNE(PG_all, dims = 1:12, do.fast=TRUE)
Idents(object = PG_all) <- PG_all$RNA_snn_res.0.2
DimPlot(PG_all, reduction = "tsne", label = TRUE, pt.size = 1, label.size = 6)

PCAPlot(PG_all)

####
####
#### retrieve non-epithelial cells
FeaturePlot(PG_all, features =c('Cd74','Cd72','Cd54') , cols =c('yellow','red'), 
            reduction ='tsne', pt.size = 0.7,ncol = 3, label = T)
## Warning in FetchData(object = object, vars = c(dims, "ident", features), : The
## following requested variables were not found: Cd54

FeaturePlot(PG_all, features = c('Eng','S1pr1','Emcn'), cols =c('yellow','red'), 
            reduction = 'tsne',pt.size = 0.7,ncol=3, label = T)

#
#Immu.Cluster <- FindMarkers(PG_all, ident.1='9', only.pos=TRUE, min.pct=0.25)
#Endo.Cluster <- FindMarkers(PG_all, ident.1='4', only.pos=TRUE, min.pct=0.25)
#Immu.Cluster$Symbol <- rownames(Immu.Cluster)
#Endo.Cluster$Symbol <- rownames(Endo.Cluster)
# 
v1 <- VlnPlot(PG_all, features = c("Epcam","Cdh1","Krt5", "Krt14"), ncol = 4) #Epithelial markers
v2 <- VlnPlot(PG_all, features = c("Igfbp4","Fn1","Gng11"), ncol = 3) # stromal markers
v3 <- VlnPlot(PG_all, features = c("Eng","S1pr1","Emcn"), ncol = 3) # endothelial markers
v4 <- VlnPlot(PG_all, features = c("Cd74","Cd72"), ncol = 2) #immune markers
ggarrange(v1, v2, v3, v4, ncol = 1)

#
###
### Draw heatmap
### 
library(ComplexHeatmap)
library(circlize)
#
geneSets                 <- c('Cd74','Cd72','Cd54','Eng','S1pr1','Emcn')
cellRanks                <- PG_all@meta.data[order(PG_all@meta.data$RNA_snn_res.0.2),]
PartialMatrix            <- PG_all@assays$RNA@scale.data[which(rownames(PG_all@assays$RNA@scale.data) %in% geneSets), rownames(cellRanks)]
cellRanks$col            <- rainbow(max(as.numeric(cellRanks$RNA_snn_res.0.2))+1)[as.numeric(cellRanks$RNA_snn_res.0.5)+1]
#
ha_column <- HeatmapAnnotation(
  df  = data.frame(
    ClusterID = as.numeric(cellRanks$RNA_snn_res.0.2)
  ),
  col = list(
    ClusterID = colorRamp2(unique(as.numeric(cellRanks$RNA_snn_res.0.2)), 
                           rainbow(max(as.numeric(cellRanks$RNA_snn_res.0.2))))
  )
)
ht1 = Heatmap(PartialMatrix, name = "Scaled\nUMI", column_title = "non-epithelial cells markers", 
              top_annotation = ha_column, show_column_names=FALSE, cluster_columns = FALSE,
              row_names_gp = gpar(fontsize = 10))
draw(ht1)

# seurat function
### heatmap for non-epithelial cells markers
geneSets                 <- c('Cd74','Cd72','Cd54','Eng','S1pr1','Emcn')
DoHeatmap(PG_all, features = geneSets)
## Warning in DoHeatmap(PG_all, features = geneSets): The following features were
## omitted as they were not found in the scale.data slot for the RNA assay: Cd54,
## Cd72

################################################################################
### Step02 clustering using Seurat #############################################
################################################################################
#
PG_all <- subset(PG_all, idents = c("4","8"), invert = TRUE)
DefaultAssay(PG_all) <- "RNA"
PG_all <- NormalizeData(PG_all, normalization.method='LogNormalize', scale.factor=1e4)
PG_all <- FindVariableFeatures(PG_all, mean.function = ExpMean, 
                               dispersion.function= LogVMR,
                               mean.cutoff = c(0.0125, 3),
                               dispersion.cutoff = c(0.5, Inf))
length(PG_all@assays$RNA@var.features)
## [1] 2000
#
PG_all <- ScaleData(PG_all, vars.to.regress=c('nCount_RNA','percent.mt'))
PG_all <- RunPCA(PG_all, features = PG_all@assays$RNA@var.features, verbose = TRUE, 
                 ndims.print=1:5, nfeatures.print = 5)
PG_all <- FindNeighbors(PG_all, dims = 1:12)
PG_all <- FindClusters(PG_all, resolution=c(0.2,0.3,0.4,0.5))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 9316
## Number of edges: 302726
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9165
## Number of communities: 7
## Elapsed time: 1 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 9316
## Number of edges: 302726
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8963
## Number of communities: 10
## Elapsed time: 1 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 9316
## Number of edges: 302726
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8814
## Number of communities: 10
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 9316
## Number of edges: 302726
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8660
## Number of communities: 10
## Elapsed time: 0 seconds
PG_all <- RunTSNE(PG_all, dims = 1:12, do.fast=TRUE)
#saveRDS(PG_all, file = "PG_all.RDS")
################################################
PG_all <- readRDS("PG_all.RDS")
Idents(object = PG_all) <- PG_all$RNA_snn_res.0.5
DimPlot(PG_all, label = T, pt.size = 1, label.size = 6)

plot <- VlnPlot(PG_all, features = c("Krt5","Krt14"), combine = FALSE, 
                fill.by = c("feature","ident")) 
plot <-  lapply(
  X = plot,
  FUN = function(p) p + aes(color= PG_all$RNA_snn_res.0.5)
)
CombinePlots(plots = plot, legend = 'none')
## Warning: CombinePlots is being deprecated. Plots should now be combined using
## the patchwork system.

################################################
# Doheatmap
listsA <- c('Cdh1','Epcam','Cldn1','Ocln','Vim','Fn1','S100a4','Zeb1','Zeb2',
            'Twist1','Twist2','Snai1','Snai2','Prrx1','Prrx2','Foxc2','Cdh2','Acta2')
# Option 1
DoHeatmap(PG_all, features = listsA)
## Warning in DoHeatmap(PG_all, features = listsA): The following features were
## omitted as they were not found in the scale.data slot for the RNA assay: Foxc2,
## Snai1, Twist1, Zeb1, Ocln, Cdh1

# Option 2: customise
DoHeatmap(PG_all, features = listsA, disp.min = -1,
          slot = 'scale.data', 
          size = 3,
          group.colors = rainbow(9)) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red",midpoint = 1)
## Warning in DoHeatmap(PG_all, features = listsA, disp.min = -1, slot =
## "scale.data", : The following features were omitted as they were not found in
## the scale.data slot for the RNA assay: Foxc2, Snai1, Twist1, Zeb1, Ocln, Cdh1
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

#cluster 7 showed both epithelial and stromal markers
################################################################################
# Step 03 DEGs and GSEA analysis
################################################################################
dev.off()
## null device 
##           1
PG.markers.7 <- FindMarkers(PG_all, ident.1 = "7", min.pct = 0.20)
PG.markers.7$gene <- rownames(PG.markers.7)
library(biomaRt)
gene_id <- getBM(attributes = c("ensembl_gene_id","external_gene_name","entrezgene_id"),
                 mart = useDataset("mmusculus_gene_ensembl", useMart("ensembl")))
PG.markers.7 <- merge(PG.markers.7, gene_id[,c(2,3)], by.x = "gene", by.y = "external_gene_name")
PG.markers.7.filt <- PG.markers.7
PG.markers.7.filt <- PG.markers.7.filt[!duplicated(PG.markers.7.filt$entrezgene_id),]
#PG.markers.7.filt <- PG.markers.7[which(PG.markers.7$p_val_adj < 0.05),]
genelist <- PG.markers.7$avg_log2FC
names(genelist) <-  PG.markers.7$entrezgene_id
genelist <- sort(genelist, decreasing = TRUE)

library(fgsea)
library(msigdbr)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(ggplot2)
m_df <- msigdbr(species = "Mus musculus", category = "C2") %>% dplyr::select(gs_name, entrez_gene)
m_df_fgsea <- split(x = m_df$entrez_gene, f = m_df$gs_name)
Zhang_BasHi <- read.csv("Zhang_BasHi.csv", header = FALSE, fileEncoding="UTF-8-BOM") # add other cell markers from the publication
#Zhang_LumHi <- read.csv("Zhang_LumHi.csv") # add other cell markers from the publication
Zhang_BasHi <- data.frame(Zhang_BasHi)
colnames(Zhang_BasHi)[1] <- "gene"
library(tidyverse)
## Registered S3 method overwritten by 'cli':
##   method     from         
##   print.boxx spatstat.geom
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.3     v purrr   0.3.4
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x data.table::between() masks dplyr::between()
## x tidyr::expand()       masks Matrix::expand()
## x dplyr::filter()       masks stats::filter()
## x data.table::first()   masks dplyr::first()
## x dplyr::lag()          masks stats::lag()
## x data.table::last()    masks dplyr::last()
## x tidyr::pack()         masks Matrix::pack()
## x biomaRt::select()     masks dplyr::select()
## x purrr::transpose()    masks data.table::transpose()
## x tidyr::unpack()       masks Matrix::unpack()
Zhang_BasHi$gene <- str_to_sentence(Zhang_BasHi$gene)
colnames(Zhang_BasHi)[1] <- "gene"
Zhang_BasHi <- merge(Zhang_BasHi, gene_id[,c(2,3)],by.x = "gene", by.y = "external_gene_name")
Zhang_BasHi <- Zhang_BasHi[,2]
m_df_fgsea[["Zhang_BasHi"]] <- Zhang_BasHi
  
fgseaRes <- fgsea(pathways = m_df_fgsea, 
                  stats    = genelist,
                  eps      = 0.05,
                  minSize  = 15,
                  maxSize  = 800)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.37% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
## Warning in fgseaMultilevel(...): There were 2 pathways for which P-values were
## not calculated properly due to unbalanced (positive and negative) gene-level
## statistic values. For such pathways pval, padj, NES, log2err are set to NA. You
## can try to increase the value of the argument nPermSimple (for example set it
## nPermSimple = 10000)
## Warning in fgseaMultilevel(...): For some of the pathways the P-values were
## likely overestimated. For such pathways log2err is set to NA.
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 0.05. You can set the `eps` argument to zero for better estimation.
selected <- fgseaRes[c(979, 329, 443, 445, 907),]
selected <- selected$pathway
plotGseaTable(m_df_fgsea[selected], genelist, fgseaRes, 
              gseaParam=0.5)
#### Pathview ####
dev.off()
## null device 
##           1
library(pathview)
## 
## ##############################################################################
## Pathview is an open source software package distributed under GNU General
## Public License version 3 (GPLv3). Details of GPLv3 is available at
## http://www.gnu.org/licenses/gpl-3.0.html. Particullary, users are required to
## formally cite the original Pathview paper (not just mention it) in publications
## or products. For details, do citation("pathview") within R.
## 
## The pathview downloads and uses KEGG data. Non-academic uses may require a KEGG
## license agreement (details at http://www.kegg.jp/kegg/legal.html).
## ##############################################################################
dme <- pathview(gene.data= genelist, pathway.id="mmu04310", species = "mmu")
## Loading required namespace: org.Mm.eg.db
## 
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory C:/Users/lianc/Documents/R_HelenHeZhu_StemCell
## Info: Writing image file mmu04310.pathview.png
knitr::include_graphics("mmu04310.pathview.png")

################################################################################
#### Step 04 Trajectory analysis ########
################################################################################

library(monocle3)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: SingleCellExperiment
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
## 
##     rowMedians
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following objects are masked from 'package:data.table':
## 
##     first, second
## The following objects are masked from 'package:Matrix':
## 
##     expand, unname
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:purrr':
## 
##     reduce
## The following object is masked from 'package:data.table':
## 
##     shift
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:SeuratObject':
## 
##     Assays
## The following object is masked from 'package:Seurat':
## 
##     Assays
## 
## Attaching package: 'monocle3'
## The following objects are masked from 'package:Biobase':
## 
##     exprs, fData, fData<-, pData, pData<-
library(dplyr)
library(SeuratWrappers)
library(ggplot2)
data <- GetAssayData(PG_all)[VariableFeatures(PG_all),]
pd <- data.frame(PG_all@meta.data)
fData <- data.frame(gene_short_name = rownames(data), row.names = row.names(data))
cds <- new_cell_data_set(expression_data = data,
                         cell_metadata = pd,
                         gene_metadata = fData)
cds <- preprocess_cds(cds, num_dim = 12)
#plot_pc_variance_explained(cds)
cds <- reduce_dimension(cds,
                        preprocess_method = "PCA",
                        reduction_method= "tSNE",
                        verbose = T)
## Reduce dimension by tSNE ...
cds <- cluster_cells(cds = cds , reduction_method = "tSNE", verbose = TRUE,
                     resolution = NULL)
## Running leiden clustering algorithm ...
## Run kNN based graph clustering starts:
##   -Input data of 9316 rows and 2 columns
##   -k is set to 20
##   Finding nearest neighbors...
##     DONE. Run time: 0.0400000000000205s
##   Compute jaccard coefficient between nearest-neighbor sets ...
##     DONE. Run time: 0s
##   Build undirected graph from the weighted links ...
##     DONE. Run time: 0.0799999999999841s
##   Run leiden clustering ...
## leiden_find_partition: partition_type       CPMVertexPartition
## leiden_find_partition: seed                 1778156675
## leiden_find_partition: resolution_parameter 1e-04
## leiden_find_partition: num_iter             2
## leiden_find_partition: initial_membership   0
## leiden_find_partition: edge_weights         0
## leiden_find_partition: node_sizes           0
## leiden_find_partition: number vertices      9316
## leiden_find_partition: number edges         186320
##     Current resolution is 1e-04; Modularity is 0.721961664040301; Quality is 368516.3332; Significance is 137204.426592028; Number of clusters is 7
##     Done. Run time: 0.172152042388916s
##   Clustering statistics
##    resolution_parameter  quality modularity significance cluster_count selected
##                   1e-04 368516.3  0.7219617     137204.4             7        *
## 
##   Cell counts by cluster
##    cluster cell_count cell_fraction
##          1       3931         0.422
##          2       1803         0.194
##          3       1566         0.168
##          4       1552         0.167
##          5        399         0.043
##          6         37         0.004
##          7         28         0.003
## 
##   Maximal modularity is 0.721961664040301 for resolution parameter 1e-04
## 
##   Run kNN based graph clustering DONE.
##   -Number of clusters: 7
p1 <- plot_cells(cds, reduction_method = "tSNE")
## No trajectory to plot. Has learn_graph() been called yet?
#compare with seurat data, res = 0.5
cds@clusters$tSNE$clusters <- PG_all$RNA_snn_res.0.5
p2 <- plot_cells(cds, reduction_method = "tSNE")
## No trajectory to plot. Has learn_graph() been called yet?
p1 + p2

# learn graph using UMAP reduction method
cds <- reduce_dimension(cds,
                        reduction_method= "UMAP",
                        verbose = T)
## No preprocess_method specified, using preprocess_method = 'PCA'
## Running Uniform Manifold Approximation and Projection
## 12:47:33 UMAP embedding parameters a = 1.577 b = 0.8951
## 12:47:33 Read 9316 rows and found 12 numeric columns
## 12:47:33 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 12:47:34 Writing NN index file to temp file C:\Users\lianc\AppData\Local\Temp\RtmpCQR6tR\file2d744c332422
## 12:47:34 Searching Annoy index using 1 thread, search_k = 1500
## 12:47:35 Annoy recall = 100%
## 12:47:36 Commencing smooth kNN distance calibration using 1 thread
## 12:47:37 Initializing from normalized Laplacian + noise
## 12:47:38 Commencing optimization for 500 epochs, with 192142 positive edges
## 12:47:53 Optimization finished
cds <- cluster_cells(cds = cds , reduction_method = "UMAP", verbose = TRUE)
## Running leiden clustering algorithm ...
## Run kNN based graph clustering starts:
##   -Input data of 9316 rows and 2 columns
##   -k is set to 20
##   Finding nearest neighbors...
##     DONE. Run time: 0.0300000000000296s
##   Compute jaccard coefficient between nearest-neighbor sets ...
##     DONE. Run time: 0.00999999999999091s
##   Build undirected graph from the weighted links ...
##     DONE. Run time: 0.0300000000000296s
##   Run leiden clustering ...
## leiden_find_partition: partition_type       CPMVertexPartition
## leiden_find_partition: seed                 726503898
## leiden_find_partition: resolution_parameter 1e-04
## leiden_find_partition: num_iter             2
## leiden_find_partition: initial_membership   0
## leiden_find_partition: edge_weights         0
## leiden_find_partition: node_sizes           0
## leiden_find_partition: number vertices      9316
## leiden_find_partition: number edges         186320
##     Current resolution is 1e-04; Modularity is 0.765461199534068; Quality is 368677.3964; Significance is 156256.528575102; Number of clusters is 8
##     Done. Run time: 0.0795819759368896s
## 
##   Clustering statistics
##    resolution_parameter  quality modularity significance cluster_count selected
##                   1e-04 368677.4  0.7654612     156256.5             8        *
## 
##   Cell counts by cluster
##    cluster cell_count cell_fraction
##          1       2888         0.310
##          2       2329         0.250
##          3       2044         0.219
##          4       1309         0.141
##          5        400         0.043
##          6        277         0.030
##          7         39         0.004
##          8         30         0.003
## 
##   Maximal modularity is 0.765461199534068 for resolution parameter 1e-04
## 
##   Run kNN based graph clustering DONE.
##   -Number of clusters: 8
plot_cells(cds, reduction_method = "UMAP", show_trajectory_graph = FALSE, group_label_size = 5)

cds <- learn_graph(cds, use_partition = TRUE)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## Warning in min(data_df$weight[data_df$weight > 0]): no non-missing arguments to
## min; returning Inf
# save pre-selected cluster 7 cells as the root cells
pre_selected_cells <- colnames(subset(PG_all, idents = "7"))
cds <- order_cells(cds, reduction_method = "UMAP", root_cells = pre_selected_cells)
# plot trajectories colored by pseudotime
plot_cells(
  cds = cds,
  color_cells_by = "pseudotime",
  show_trajectory_graph = TRUE,
  group_label_size = 5,
  graph_label_size = 3
)
## Warning: `select_()` was deprecated in dplyr 0.7.0.
## Please use `select()` instead.
## Cells aren't colored in a way that allows them to be grouped.

cds@clusters$UMAP$clusters <- PG_all$RNA_snn_res.0.5
plot_cells(
  cds = cds,
  color_cells_by = "pseudotime",
  show_trajectory_graph = F,
  group_label_size = 5,
  graph_label_size = 3
)
## Cells aren't colored in a way that allows them to be grouped.

plot_cells(
  cds = cds,
  color_cells_by = "cluster",
  show_trajectory_graph = FALSE,
  group_label_size = 5,
  graph_label_size = 3
)

plot_cells(
  cds = cds,
  color_cells_by = "cluster",
  group_label_size = 5,
  graph_label_size = 4,
  label_leaves = F,
  label_branch_points = F,
  show_trajectory_graph = TRUE,
  scale_to_range = T
)

# Add pseudotime value to the seurat object
PG_all <- AddMetaData(object = PG_all, 
                      metadata = cds@principal_graph_aux@listData$UMAP$pseudotime,
                      col.name =  "pseudotime")
FeaturePlot(PG_all, "pseudotime", reduction = "tsne", label = T, label.size = 8) + scale_color_viridis_c()
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

F1 <- FeaturePlot(PG_all, "pseudotime", reduction = "pca", label = T, label.size = 6, repel = T) + scale_color_viridis_c()
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
F2 <- DimPlot(PG_all, reduction = "pca", group.by = "seurat_clusters", label = T, repel = T)
ggarrange(F1, F2, ncol = 2)
## Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

################################################################################
#### Diffusion Map ####
################################################################################
# package 'destiny' is not available for Bioconductor version '3.13'
################################################################################
#### Slingshot ####
################################################################################
library(slingshot)
## Loading required package: princurve
## Loading required package: TrajectoryUtils
pd <- data.frame(PG_all@meta.data)
fData <- data.frame(gene_short_name = rownames(data), row.names = row.names(data))
counts <- GetAssayData(PG_all)[VariableFeatures(PG_all),]
sce <- SingleCellExperiment(list(counts = counts),
                            metadata = pd)


# use PCA
reducedDims(sce) <- list(pca = PG_all@reductions$pca@cell.embeddings)
reducedDims(sce)$pca <- reducedDims(sce)$pca[, 1:12]
sce
## class: SingleCellExperiment 
## dim: 2000 9316 
## metadata(10): orig.ident nCount_RNA ... seurat_clusters pseudotime
## assays(1): counts
## rownames(2000): Acta2 Ccl21a ... 1700097N02Rik Gm10556
## rowData names(0):
## colnames(9316): AAACCTGAGTTACGGG-1 AAACCTGCAAGGTTTC-1 ...
##   TTTGTCATCTCTAAGG-1 TTTGTCATCTTACCGC-1
## colData names(0):
## reducedDimNames(1): pca
## mainExpName: NULL
## altExpNames(0):
sce <- slingshot(sce,
                 reducedDim = "pca",
                 clusterLabels = sce@metadata$RNA_snn_res.0.5,
                 start.clus = 7)
sds <- SlingshotDataSet(sce)
sds  # has 5 or 6 lineages
## class: SlingshotDataSet 
## 
##  Samples Dimensions
##     9316         12
## 
## lineages: 6 
## Lineage1: 7  0  5  2  8  
## Lineage2: 7  0  5  4  
## Lineage3: 7  0  5  6  
## Lineage4: 7  0  3  
## Lineage5: 7  0  1  
## Lineage6: 7  9  
## 
## curves: 6 
## Curve1: Length: 179.94   Samples: 4394.44
## Curve2: Length: 136.35   Samples: 3823.89
## Curve3: Length: 239.65   Samples: 3965.85
## Curve4: Length: 124.63   Samples: 3863.12
## Curve5: Length: 123.43   Samples: 4639.47
## Curve6: Length: 288.31   Samples: 444.83
##########################################################################
############## visualization for pca results ##############################
#######################################################################
# Plot clusters with identified lineages overlayed
rd <- data.frame(reducedDim(sce))
cl <- unlist(sce@metadata$RNA_snn_res.0.5)
names(cl) <- rownames(PG_all@meta.data)

par(xpd=TRUE)
par(mar=c(4.5,5.5,2,7))
plot(rd[,1:2], col = rainbow(10)[cl], asp = 0.5, pch = 20)
lines(SlingshotDataSet(sce),  lwd = 2, col = 'black')
legend(x = 30, y = 50, legend= order(unique(cl)), 
       fill=rainbow(10)[order(unique(cl))])

# Add into Seurat object
PG_all$slingPseudotime_1 <- sce$slingPseudotime_1
PG_all$slingPseudotime_2 <- sce$slingPseudotime_2
PG_all$slingPseudotime_3 <- sce$slingPseudotime_3
PG_all$slingPseudotime_4 <- sce$slingPseudotime_4
PG_all$slingPseudotime_5 <- sce$slingPseudotime_5
PG_all$slingPseudotime_6 <- sce$slingPseudotime_6
 
FeaturePlot(PG_all, c("slingPseudotime_1","slingPseudotime_2","slingPseudotime_3",
                      "slingPseudotime_4","slingPseudotime_5","slingPseudotime_6"), 
            label = T, label.size = 5, cols = rainbow(12))

##########################################################################
### Visualize pseudotime using pca data
##########################################################################
## scatter3d plot, plotly 
# refer to https://github.com/Dragonmasterx87/Interactive-3D-Plotting-in-Seurat-3.0.0/blob/master/3D%20UMAP%20Plotting%20v1.3.R
library(rgl)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:biomaRt':
## 
##     select
## The following object is masked from 'package:ComplexHeatmap':
## 
##     add_heatmap
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
#Embeddings(object = PG_all, reduction = "pca")
#summary(Embeddings(object = PG_all, reduction = "pca"))
plot.data <- FetchData(object = PG_all, vars = c("PC_1", "PC_2", "PC_3", "seurat_clusters", "slingPseudotime_1","slingPseudotime_2","slingPseudotime_3","slingPseudotime_4","slingPseudotime_5","slingPseudotime_6"))
plot.data$label <- paste(rownames(plot.data))
fig <- plot_ly(data = plot.data, 
               x = ~PC_1, y = ~PC_2, z = ~PC_3, 
               color = ~seurat_clusters, 
               colors = rainbow(10),
               type = "scatter3d", 
               mode = "markers", 
               marker = list(size = 2, width=2), # controls size of points
               text=~seurat_clusters, #This is that extra column we made earlier for which we will use for cell ID
               hoverinfo="text") #When you visualize your plotly object, hovering your mouse pointer over a point shows cell names
               
fig <- fig %>% layout(scene = list(xaxis= list(title = "PC_1"),
                                   yaxis= list(title = "PC_2"),
                                   zaxis= list(title = "PC_3")))
fig 
## Failed to add lineage
# <- fig %>% add_trace(type = 'scatter3d', data = plot.data,
               #          x = ~slingPseudotime_1, y = ~slingPseudotime_2, y = ~slingPseudotime_3,
                #         line = list(color = 'black', width = 1))

#fig
##########################################################################
### Helen's script for the scatter3d plot
###########################################################################
Dims.pca          <- sds@reducedDim[,1:3]         # pca dimension
clusterLabels.pca <- sds@clusterLabels        # cluster labels
connectivity.pca  <- sds@adjacency         # 
clusters.pca      <- rownames(connectivity.pca)       #
nclus.pca         <- nrow(connectivity.pca)          #
centers.pca       <- t(sapply(clusters.pca,function(x){
  x.sub <- Dims.pca[rownames(clusterLabels.pca[which(clusterLabels.pca[, x] == "1"),]),]
  return(colMeans(x.sub))
}))    
rownames(centers.pca) <- clusters.pca                                              # add row names
#Dims.pca              <- Dims.pca[ clusterLabels.pca %in% clusters.pca, ]          # 
#clusterLabels.pca     <- clusterLabels.pca[clusterLabels.pca %in% clusters.pca]
#
sds@lineages
## $Lineage1
## [1] "7" "0" "5" "2" "8"
## 
## $Lineage2
## [1] "7" "0" "5" "4"
## 
## $Lineage3
## [1] "7" "0" "5" "6"
## 
## $Lineage4
## [1] "7" "0" "3"
## 
## $Lineage5
## [1] "7" "0" "1"
## 
## $Lineage6
## [1] "7" "9"
xs <- c(NULL, centers.pca[, 1 ])
ys <- c(NULL, centers.pca[, 2 ])
zs <- c(NULL, centers.pca[, 3 ])
xs <- c( xs, as.numeric(sapply( sds@curves, function(c){     c$s[, 1 ] }) ))
ys <- c( ys, as.numeric(sapply( sds@curves, function(c){     c$s[, 2 ] }) ))
zs <- c( zs, as.numeric(sapply( sds@curves, function(c){     c$s[, 3 ] }) ))

# straight line 3d plot
rgl::plot3d(x = NULL, y = NULL, z = NULL, aspect = 'iso', 
            xlim = range(xs), ylim = range(ys), zlim = range(zs), 
            box=FALSE, axes=FALSE, xlab = '', ylab = '', zlab = '' )
colpal = rainbow(11)
rgl::plot3d(Dims.pca, col= rainbow(10)[cl], 
             add=TRUE, type='p', size=4, pch=20,alpha=I(1/8), 
            box=FALSE, axes=FALSE)
rgl::abclines3d(max(Dims.pca[,1]),max(Dims.pca[,2]),max(Dims.pca[,3]), 
                a = diag(3), col = "black", lwd=2)
rgl::plot3d(centers.pca, size = 10, add = TRUE, pch = 17,
            col = colpal[as.numeric(rownames(centers.pca))+1], alpha=1)
for (i in 1:(nclus.pca-1)){
  for (j in (i+1):nclus.pca){
    if (connectivity.pca[i,j]==1){
      rgl::lines3d(x=centers.pca[c(i,j),1], y=centers.pca[c(i,j),2], 
                   z=centers.pca[c(i,j),3], 
                   col=colpal[as.numeric(rownames(connectivity.pca))+1], 
                   lwd=2)
    }
  }
}
rgl::rgl.snapshot('Slingshot.Cluster9.branchs.straightLine2.png', fmt='png',top=TRUE)
##############################################################################
#### draw multiple lineage inference ###
#############################################################################
## order the cells on each branch
library(dplyr)
for (i in 1:ncol(slingPseudotime(sds))){
  linedf <- data.frame(pseudotime=slingPseudotime(sds)[,i], 
                       clus.labels = cl, 
                       samples=rownames(slingPseudotime(sds)))
  linedf <- linedf[order(linedf$pseudotime),]
  medoids <- sapply(levels(linedf$clus.labels), function(clID){
    x.sub <- linedf %>% filter(linedf$clus.labels == clID) %>% dplyr::select(pseudotime)
    col <- colpal[as.numeric(as.character(linedf$clus.labels))+1][which.max(linedf$clus.labels == clID)]
    return( list(means=mean(x.sub$pseudotime, na.rm=TRUE), sdev= sd(x.sub$pseudotime, na.rm=TRUE),col=col) )
  })

  means = unlist(medoids$means)
  sdev  = unlist(medoids$sdev)
  col   = unlist(medoids$col)  
  #pdf(paste('2021.08.25.Slingshot.Cluster09.Pseudo.branch_',i,'.pdf',sep=''), 
   #   width=6,height=3)
  plot(linedf$pseudotime,rep(0, length(linedf$pseudotime)),cex=3,axes=F, 
       pch=16, xlab='', ylab='', 
       col=colpal[as.numeric(as.character(linedf$clus.labels))+1], 
       ylim=c(-0.1, 0.1), xlim = range(linedf$pseudotime, na.rm=TRUE)); 
  abline(h=0, col="black")
  points(x=means,y=rep(0.07, length(means)), col=col, pch=19)
  arrows(means-sdev, rep(0.07, length(means)), means+sdev, rep(0.07, length(means)), 
         length=0.05, angle=90, code=3, col=col)
  #dev.off()
}